{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "module_path = os.path.abspath(os.path.join('..'))\n",
    "sys.path.append(module_path)\n",
    "\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "sns.set_style('whitegrid')\n",
    "color = sns.color_palette()\n",
    "%matplotlib inline\n",
    "matplotlib.style.use('ggplot')\n",
    "\n",
    "import time\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from IPython.display import display\n",
    "\n",
    "# remove warnings\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "import xgboost as xgb\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import auc, roc_curve\n",
    "from itertools import product\n",
    "\n",
    "from models.get_datasets import load_stacking_datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "load stacking features\n"
     ]
    }
   ],
   "source": [
    "train, test = load_stacking_datasets()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "特征数 350\n"
     ]
    }
   ],
   "source": [
    "X_train = train.drop(['Id','Score'], axis = 1)\n",
    "X_test  = test.drop(['Id'], axis = 1)\n",
    "\n",
    "y_train = train['Score']\n",
    "\n",
    "df_columns = X_train.columns\n",
    "print('特征数 %d' % len(df_columns))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "dtrain = xgb.DMatrix(X_train.values, y_train.values, feature_names=df_columns)\n",
    "dtest = xgb.DMatrix(X_test.values, feature_names=df_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import auc, roc_curve\n",
    "from itertools import product\n",
    "\n",
    "def model_cross_validate(xgb_params, cv_param_dict, dtrain, cv_num_boost_round=4000, early_stopping_rounds=100, cv_nfold=5, stratified=True):\n",
    "    params_value = []\n",
    "    params_name = [k for k, w in cv_param_dict.items()]\n",
    "    max_auc = 0\n",
    "    for param in params_name:\n",
    "        params_value.append(cv_param_dict[param])\n",
    "\n",
    "    for param_pair in product(*params_value):\n",
    "        param_str = ''\n",
    "        for i in range(len(param_pair)):\n",
    "            param_str += params_name[i] + '=' + str(param_pair[i]) + ' '\n",
    "            xgb_params[params_name[i]] = param_pair[i]\n",
    "        \n",
    "        start = time.time()\n",
    "        cv_result = xgb.cv(xgb_params, dtrain, num_boost_round=cv_num_boost_round, stratified=stratified,\n",
    "                           nfold=cv_nfold, early_stopping_rounds=early_stopping_rounds)\n",
    "        \n",
    "        best_num_boost_rounds = len(cv_result)\n",
    "        mean_test_auc = cv_result.loc[best_num_boost_rounds - 6: best_num_boost_rounds - 1, 'test-rmse-mean'].mean()\n",
    "        mean_test_auc = 1 / (1 + mean_test_auc)\n",
    "        \n",
    "        if mean_test_auc > max_auc:\n",
    "            best_param = param_pair\n",
    "            max_auc = mean_test_auc\n",
    "        \n",
    "        end = time.time()\n",
    "        print('Tuning paramter: {}, best_ntree_limit:{}, auc = {:.7f}, cost: {}s'.format(param_str, best_num_boost_rounds,\n",
    "                                                                              mean_test_auc, end-start))\n",
    "    param_str = ''\n",
    "    for i in range(len(best_param)):\n",
    "        param_str += params_name[i] + '=' + str(best_param[i]) + ' '\n",
    "        xgb_params[params_name[i]] = best_param[i]\n",
    "    print('===========best paramter: {} auc={:.7f}==========='.format(param_str, max_auc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_params = {\n",
    "    'eta': 0.1,\n",
    "    'min_child_weight': 1,\n",
    "    'colsample_bytree': 0.8,\n",
    "    'max_depth': 5,\n",
    "    'subsample': 0.8,\n",
    "    'gamma': 0,\n",
    "    'scale_pos_weight': 1,\n",
    "    'eval_metric': 'rmse',\n",
    "    'objective': 'reg:linear',\n",
    "    'seed':2018,\n",
    "    'silent': 1,\n",
    "    'booster': 'gbtree'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "---> calc baseline model\n",
      "mean_train_auc = 0.6361679 , mean_test_rmse = 0.6220402\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print('---> calc baseline model')\n",
    "\n",
    "cv_num_boost_round=5000\n",
    "early_stopping_rounds=100\n",
    "cv_nfold=5\n",
    "stratified=True\n",
    "\n",
    "cv_result = xgb.cv(xgb_params,\n",
    "                   dtrain,\n",
    "                   nfold=cv_nfold,\n",
    "                   stratified=stratified,\n",
    "                   num_boost_round=cv_num_boost_round,\n",
    "                   early_stopping_rounds=early_stopping_rounds,\n",
    "                   )\n",
    "\n",
    "best_num_boost_rounds = len(cv_result)\n",
    "mean_train_auc = cv_result.loc[best_num_boost_rounds-6 : best_num_boost_rounds-1, 'train-rmse-mean'].mean()\n",
    "mean_test_auc = cv_result.loc[best_num_boost_rounds-6 : best_num_boost_rounds-1, 'test-rmse-mean'].mean()\n",
    "\n",
    "mean_train_auc = 1 / (1 + mean_train_auc)\n",
    "mean_test_auc = 1 / (1 + mean_test_auc)\n",
    "\n",
    "print('mean_train_auc = {:.7f} , mean_test_rmse = {:.7f}\\n'.format(mean_train_auc, mean_test_auc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tuning paramter: max_depth=5 min_child_weight=1 , best_ntree_limit:120, auc = 0.6220402, cost: 336.6737847328186s\n",
      "Tuning paramter: max_depth=5 min_child_weight=3 , best_ntree_limit:173, auc = 0.6220597, cost: 409.0248680114746s\n",
      "Tuning paramter: max_depth=5 min_child_weight=5 , best_ntree_limit:73, auc = 0.6220327, cost: 314.0649356842041s\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-19-85a27bea63e3>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[0mcv_paramters\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;34m'max_depth'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m15\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'min_child_weight'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mmodel_cross_validate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mxgb_params\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcv_paramters\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtrain\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m<ipython-input-18-d6cfbe3b8877>\u001b[0m in \u001b[0;36mmodel_cross_validate\u001b[1;34m(xgb_params, cv_param_dict, dtrain, cv_num_boost_round, early_stopping_rounds, cv_nfold, stratified)\u001b[0m\n\u001b[0;32m     19\u001b[0m         \u001b[0mstart\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     20\u001b[0m         cv_result = xgb.cv(xgb_params, dtrain, num_boost_round=cv_num_boost_round, stratified=stratified,\n\u001b[1;32m---> 21\u001b[1;33m                            nfold=cv_nfold, early_stopping_rounds=early_stopping_rounds)\n\u001b[0m\u001b[0;32m     22\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     23\u001b[0m         \u001b[0mbest_num_boost_rounds\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcv_result\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\users\\demonsong\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\xgboost-0.7-py3.6.egg\\xgboost\\training.py\u001b[0m in \u001b[0;36mcv\u001b[1;34m(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks, shuffle)\u001b[0m\n\u001b[0;32m    404\u001b[0m                            evaluation_result_list=None))\n\u001b[0;32m    405\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mfold\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcvfolds\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 406\u001b[1;33m             \u001b[0mfold\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    407\u001b[0m         \u001b[0mres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0maggcv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0meval\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeval\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcvfolds\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    408\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\users\\demonsong\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\xgboost-0.7-py3.6.egg\\xgboost\\training.py\u001b[0m in \u001b[0;36mupdate\u001b[1;34m(self, iteration, fobj)\u001b[0m\n\u001b[0;32m    216\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    217\u001b[0m         \u001b[1;34m\"\"\"\"Update the boosters for one iteration\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 218\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbst\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    219\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    220\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0meval\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeval\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mc:\\users\\demonsong\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\xgboost-0.7-py3.6.egg\\xgboost\\core.py\u001b[0m in \u001b[0;36mupdate\u001b[1;34m(self, dtrain, iteration, fobj)\u001b[0m\n\u001b[0;32m    893\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mfobj\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    894\u001b[0m             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, ctypes.c_int(iteration),\n\u001b[1;32m--> 895\u001b[1;33m                                                     dtrain.handle))\n\u001b[0m\u001b[0;32m    896\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    897\u001b[0m             \u001b[0mpred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "cv_paramters = {'max_depth':range(5,15,2),'min_child_weight':range(1,10,2)}\n",
    "model_cross_validate(xgb_params, cv_paramters, dtrain)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
